# library preparations
import scipy.io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import joblib
import seaborn as sns
import time
Given that we could just ask the simulation for more samples, I decided to NOT employ a typical training-split. Ratther, I will deploy all samples for training and optimize based on cross validation approaches, given every test record a turn at being in the training set and test set. Once the model is fitted, we can then ask the simulation for some more samples (say 1000) to use as a test set completely related here.
Many metrics can be used to evaluate models, some I calculate here are:

Sometimes, in the realm of health sciences, we want to punish or reward the model for doing something really well compared to other. For example, in cancer prediction, more emphasis is implaced on avoiding False Negatives (not detecting the cancer), so that we may wish to assign costs/weights (negative means reward) to TP, FP, TN, and FN like this:

This can be implemented as other alternative to above metrics during cross validation. Or, cost matrix can be used to classify one particular record. That is, we can use cost matrix to evaluate risk
With a RandomForest, I am able to extract the probability of a sample as showing SAS or not, then say:
P(SAS) = 0.2 P(neutral, other) = 0.8
Given the above cost matrix, then when I:
load_train = np.load('./data/train.npz', allow_pickle=True)
load_test = np.load('./data/test.npz', allow_pickle=True)
X_train, y_train = load_train['X_train'], load_train['y_train']
X_test, y_test = load_test['X_test'], load_test['y_test']
samples, rows, cols = X_train.shape
print("Before flattening:")
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
y_train = y_train.reshape(y_train.shape[0], )
y_test = y_test.reshape(y_test.shape[0], )
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
print("After flattening:")
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
Before flattening: (6500, 460, 12) (6500, 1) (647, 460, 12) (647, 1) After flattening: (6500, 5520) (6500,) (647, 5520) (647,)
# Prepare for K fold cross validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
k = 10
# again, random_state for sake of reproducibility
kf = KFold(n_splits=k, random_state = 42, shuffle=True)
# default confusion matrix
# we can calculate TP, FP, FN, TN from this matrix at the end
matrix = pd.DataFrame(0, index=['True Yes', 'True No', ''],
columns=['Pred Yes', 'Pred No', ''])
start = time.time()
for train_index, test_index in kf.split(X_train):
# identify the train and test set within each fold
X_fold_train, X_fold_test = X_train[train_index], X_train[test_index]
y_fold_train, y_fold_test = y_train[train_index], y_train[test_index]
# fit the model on the training set
model = RandomForestClassifier(n_estimators=500, oob_score=True,
min_samples_leaf = 10, max_depth = 5,
min_samples_split = 100, max_leaf_nodes = 50,
max_features = 'sqrt', n_jobs=8)
model.fit(X_fold_train, y_fold_train)
# predict label on validation test set, record results
y_pred = model.predict(X_fold_test)
y_prob = model.predict_proba(X_fold_test)[:,1]
# collect metrics
m = pd.crosstab(y_pred, y_fold_test, margins=True)
m.index = ['True Yes', 'True No', '']
m.columns = ['Pred Yes', 'Pred No', '']
matrix += m
end = time.time()
print('Time taken:', end - start)
print(matrix)
TP, FN = matrix.iloc[0, 0], matrix.iloc[0, 1]
FP, TN = matrix.iloc[1, 0], matrix.iloc[1, 1]
total = TP + FN + FP + TN
print("Accuracy: \t\t\t\t\t\t\t", (TP + TN) / total)
print("Error Rate: \t\t\t\t\t\t\t", (FN + FP) / total)
recall = TP / (TP + FN)
print("True Positive Rate (TPR) | Sensitivity | Recall | Coverage\t", recall)
print("True Negative Rate (TNR) | Specificity \t\t\t\t",
TN / (FP + TN))
print("False Positive Rate (FPR) is \t\t\t\t\t", FP / (FP + TN))
print("False Negative Rate (FNR) is \t\t\t\t\t", FN / (TP + FN))
precision = TP / (TP + FP)
print("Average Precision: \t\t\t\t\t\t", precision)
print("Average Recall: \t\t\t\t\t\t", recall)
print("Average F Measures: \t\t\t\t\t\t",
(2*precision*recall) / (precision+recall))
pred_prob = model.predict_proba(X_train)[:, 1]
print("Average AUC: \t\t\t\t\t\t\t", roc_auc_score(y_train, pred_prob))
Time taken: 92.98476338386536
Pred Yes Pred No
True Yes 1986 175 2161
True No 459 3880 4339
2445 4055 6500
Accuracy: 0.9024615384615384
Error Rate: 0.09753846153846153
True Positive Rate (TPR) | Sensitivity | Recall | Coverage 0.9190189726978251
True Negative Rate (TNR) | Specificity 0.8942152569716525
False Positive Rate (FPR) is 0.10578474302834755
False Negative Rate (FNR) is 0.08098102730217492
Average Precision: 0.8122699386503067
Average Recall: 0.9190189726978251
Average F Measures: 0.8623534520191055
Average AUC: 0.9835690745097447
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# n_estimators = number of trees to bag
# Fit the model on all training samples possible
# clf = RandomForestClassifier(n_estimators=500, criterion='gini', max_depth=None,
# min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
# max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
# min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None,
# random_state=None, verbose=0, warm_start=False, class_weight=None,
# ccp_alpha=0.0, max_samples=None)
# Tune: min_samples_split, max_leaf_nodes, max_depth and min_samples_leaf.
model = RandomForestClassifier(n_estimators=500, oob_score=True,
min_samples_leaf = 10, max_depth = 5,
min_samples_split = 100, max_leaf_nodes = 50,
max_features = 'sqrt', n_jobs=8)
start = time.time()
model.fit(X_train, y_train)
end = time.time()
print('Time taken:', end - start)
# save the model
joblib.dump(model, "./data/random_forest_10000.joblib")
Time taken: 9.560846090316772
['./data/random_forest_10000.joblib']
features_index = ["pi_x", "pi_y", "Pi_tot", "Fst", "Dxy", "Da", "Tajima's D on X",
"Tajima's D on Y", "Tajima's D across all samples", "SNP rel. dens. on X",
"SNP rel. dens. on Y", "SNP rel. dens. across all"]
for f, imp in zip(features_index, model.feature_importances_):
print(f'Feature {f} importance: {imp} \t\t\t\t\t\t')
features = model.feature_importances_.reshape(rows, cols).mean(axis=0)
feature_imp = pd.Series(features,index=features_index).sort_values(
ascending=False)
%matplotlib inline
# Creating a bar plot
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.title("Important Features In Detecting SAS")
plt.show()
Feature pi_x importance: 8.164764516235056e-05 Feature pi_y importance: 0.004462907796602058 Feature Pi_tot importance: 0.00020357002901359443 Feature Fst importance: 0.00045011389855475353 Feature Dxy importance: 0.00029200785282084525 Feature Da importance: 0.0004025598252616499 Feature Tajima's D on X importance: 8.83188273904177e-05 Feature Tajima's D on Y importance: 0.000749645627311387 Feature Tajima's D across all samples importance: 0.00042798755529840704 Feature SNP rel. dens. on X importance: 0.009293695994424778 Feature SNP rel. dens. on Y importance: 0.016074740674935393 Feature SNP rel. dens. across all importance: 0.010707318979558811
# Extract single tree
estimator = model.estimators_[5]
from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(estimator, out_file='tree.dot',
feature_names = features_index * rows,
class_names = ["SAS", "Neutral"],
rounded = True, proportion = False,
precision = 2, filled = True)
# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')
Below are peformance measures on the test set never touched during model building
y_pred = model.predict(X_test)
print("Accuracy on Test Set:", accuracy_score(y_test,y_pred))
print(f'Our OOB prediction of accuracy is: {model.oob_score_ * 100}%')
# show stats
matrix = pd.crosstab(y_pred, y_test, margins=True)
matrix.index = ['True Yes', 'True No', '']
matrix.columns = ['Pred Yes', 'Pred No', '']
print(matrix)
TP, FN = matrix.iloc[0, 0], matrix.iloc[0, 1]
FP, TN = matrix.iloc[1, 0], matrix.iloc[1, 1]
total = TP + FN + FP + TN
print("Accuracy: \t\t\t\t\t\t\t", (TP + TN) / total)
print("Error Rate: \t\t\t\t\t\t\t", (FN + FP) / total)
print("True Positive Rate (TPR) | Sensitivity | Recall | Coverage\t",
TP / (TP + FN))
print("True Negative Rate (TNR) | Specificity \t\t\t\t",
TN / (FP + TN))
print("False Positive Rate (FPR) is \t\t\t\t\t", FP / (FP + TN))
print("False Negative Rate (FNR) is \t\t\t\t\t", FN / (TP + FN))
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print("Average Precision: \t\t\t\t\t\t", precision)
print("Average F Measures: \t\t\t\t\t\t",
(2*recall*precision) / (recall + precision))
pred_prob = model.predict_proba(X_test)[:, 1]
print("Average AUC: \t\t\t\t\t\t\t", roc_auc_score(y_test, pred_prob))
Accuracy on Test Set: 0.5811437403400309
Our OOB prediction of accuracy is: 90.06153846153846%
Pred Yes Pred No
True Yes 45 181 226
True No 90 331 421
135 512 647
Accuracy: 0.5811437403400309
Error Rate: 0.4188562596599691
True Positive Rate (TPR) | Sensitivity | Recall | Coverage 0.19911504424778761
True Negative Rate (TNR) | Specificity 0.7862232779097387
False Positive Rate (FPR) is 0.21377672209026127
False Negative Rate (FNR) is 0.8008849557522124
Average Precision: 0.3333333333333333
Average F Measures: 0.24930747922437668
Average AUC: 0.4837528935185186
from sklearn.metrics import precision_recall_curve
# precision recall curve for training set
y_prob = model.predict_proba(X_test)[:,1]
precisions, recalls, thresholds = precision_recall_curve(
y_test, y_prob, pos_label = "Yes")
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
plt.xlabel("Threshold")
plt.legend(loc="upper left")
plt.ylim([0, 1])
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()
plt.plot(precisions, recalls, "b--", label="Precision")
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.legend(loc="upper left")
plt.show()
# ROC Curve
# The more the area under the curve the better our classifier
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob, pos_label = "Yes")
def plot_roc_curve(fpr, tpr, label=None):
plt.plot(fpr, tpr, linewidth=2, label=label)
plt.plot([0, 1], [0, 1], 'k--')
plt.axis([0, 1, 0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plot_roc_curve(fpr, tpr, "Random Forest")
plt.show()
from sklearn.metrics import roc_auc_score
print("The AUC is ", roc_auc_score(y_test, y_prob))
The AUC is 0.4837528935185186
Next steps:
Generating a diversity of inputs:
Transfer of knowledge:
Build the same model using alt_output (means)
Remaining issues: